In [1]:
from sklearn import model_selection, tree
import graphviz
import pandas as pd
import numpy as np
import re
import pydotplus 
from IPython.display import Image
from sklearn import model_selection, metrics
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the raw German credit data: a space-separated table with no header
# row, so the columns get integer labels until they are renamed later.
data = pd.read_csv('german.data.txt', header=None, sep=' ')
data.head()


Out[2]:
0 1 2 3 4 5 6 7 8 9 ... 11 12 13 14 15 16 17 18 19 20
0 A11 6 A34 A43 1169 A65 A75 4 A93 A101 ... A121 67 A143 A152 2 A173 1 A192 A201 1
1 A12 48 A32 A43 5951 A61 A73 2 A92 A101 ... A121 22 A143 A152 1 A173 1 A191 A201 2
2 A14 12 A34 A46 2096 A61 A74 2 A93 A101 ... A121 49 A143 A152 1 A172 2 A191 A201 1
3 A11 42 A32 A42 7882 A61 A74 2 A93 A103 ... A122 45 A143 A153 1 A173 2 A191 A201 1
4 A11 24 A33 A40 4870 A61 A73 3 A93 A101 ... A124 53 A143 A153 2 A173 2 A191 A201 2

5 rows × 21 columns

Вытащим из описания названия признаков и значения категориальных переменных.


In [3]:
# Parse german.doc (the dataset description) to build
#   * properties_names - the column names, in the order they appear, and
#   * properties       - category code (e.g. 'A11') -> its textual meaning.
with open('german.doc') as f:
    # Materialise a real list (not a lazy map object) so the lines can be
    # indexed below on both Python 2 and Python 3.
    description = [line.strip() for line in f]

properties_names = []
properties = {}

for i, line in enumerate(description):
    # An "Attribute N" header (the pattern also accepts the misspelling
    # "Attibute") is followed by the attribute's name on the next line.
    # The bounds check guards against a header on the very last line.
    if re.match(r'Attr?ibute \d+', line) and i + 1 < len(description):
        properties_names.append(description[i + 1])
    # Lines shaped like "A11 : <meaning>" describe one categorical value.
    match = re.match(r'(A\d+) : (.*)', line)
    if match:
        properties[match.group(1)] = match.group(2)

# The target column is named by hand and goes last.
properties_names.append('Give credit')

In [4]:
data.columns = properties_names
data.replace(properties, inplace=True)
print 'число признаков =', len(data.columns) - 1
data.head()


число признаков = 20
Out[4]:
Status of existing checking account Duration in month Credit history Purpose Credit amount Savings account/bonds Present employment since Installment rate in percentage of disposable income Personal status and sex Other debtors / guarantors ... Property Age in years Other installment plans Housing Number of existing credits at this bank Job Number of people being liable to provide maintenance for Telephone foreign worker Give credit
0 ... < 0 DM 6 critical account/ radio/television 1169 unknown/ no savings account .. >= 7 years 4 male : single none ... real estate 67 none own 2 skilled employee / official 1 yes, registered under the customers name yes 1
1 0 <= ... < 200 DM 48 existing credits paid back duly till now radio/television 5951 ... < 100 DM 1 <= ... < 4 years 2 female : divorced/separated/married none ... real estate 22 none own 1 skilled employee / official 1 none yes 2
2 no checking account 12 critical account/ education 2096 ... < 100 DM 4 <= ... < 7 years 2 male : single none ... real estate 49 none own 1 unskilled - resident 2 none yes 1
3 ... < 0 DM 42 existing credits paid back duly till now furniture/equipment 7882 ... < 100 DM 4 <= ... < 7 years 2 male : single guarantor ... if not A121 : building society savings agreement/ 45 none for free 1 skilled employee / official 2 none yes 1
4 ... < 0 DM 24 delay in paying off in the past car (new) 4870 ... < 100 DM 1 <= ... < 4 years 3 male : single none ... unknown / no property 53 none for free 2 skilled employee / official 2 none yes 2

5 rows × 21 columns

Теперь закодируем категориальные признаки, чтобы передать датасет дереву.


In [5]:
data_encode = pd.get_dummies(data)
print 'Теперь число признаков =', len(data_encode.columns) - 1
data_encode.head()


Теперь число признаков = 61
Out[5]:
Duration in month Credit amount Installment rate in percentage of disposable income Present residence since Age in years Number of existing credits at this bank Number of people being liable to provide maintenance for Give credit Status of existing checking account_ ... < 0 DM Status of existing checking account_ ... >= 200 DM / ... Housing_own Housing_rent Job_management/ self-employed/ Job_skilled employee / official Job_unemployed/ unskilled - non-resident Job_unskilled - resident Telephone_none Telephone_yes, registered under the customers name foreign worker_no foreign worker_yes
0 6 1169 4 4 67 2 1 1 1 0 ... 1 0 0 1 0 0 0 1 0 1
1 48 5951 2 2 22 1 1 2 0 0 ... 1 0 0 1 0 0 1 0 0 1
2 12 2096 2 3 49 1 2 1 0 0 ... 1 0 0 0 0 1 1 0 0 1
3 42 7882 2 4 45 1 2 1 1 0 ... 0 0 0 1 0 0 1 0 0 1
4 24 4870 3 4 53 2 2 2 1 0 ... 0 0 0 1 0 0 1 0 0 1

5 rows × 62 columns

Также приведем target к классам: 0 — не давать кредит, 1 — давать.


In [6]:
# Recode the target from the dataset's 1/2 labels to 1/0 (2 -> 0, 1 stays 1).
# replace() touches only the value 2, so re-running this cell is a no-op;
# the previous "0 if x == 2 else 1" mapping turned every label into 1 on a
# second run because no 2s were left.
# NOTE(review): assumes the column holds only the labels 1 and 2.
data_encode[u'Give credit'] = data_encode[u'Give credit'].replace({2: 0})
y = data_encode[u'Give credit']
X = data_encode.drop(u'Give credit', axis=1)

In [7]:
print 'Доли классов'
print '{} объектов 1-го класса'.format((y==1).sum() * 1./len(y))
print '{} объектов 0-го класса'.format((y==0).sum() * 1./len(y))


Доли классов
0.7 объектов 1-го класса
0.3 объектов 0-го класса

Построим дерево, получающееся при ограничении глубины не больше 2.


In [8]:
# Fit a shallow tree (depth <= 2) on the one-hot encoded data and render it.
# random_state pins the otherwise random choice between equally good splits,
# so the picture is the same on every run.
classifier = tree.DecisionTreeClassifier(max_depth=2, random_state=0)
classifier.fit(X, y)
# class_names must be given in ascending order of the class labels; the
# target is encoded as 0 = no credit, 1 = credit, so 'no credit' comes first.
# With out_file set, export_graphviz writes the file and returns None, so
# its result is not stored.
tree.export_graphviz(classifier, out_file="tree3.out",
                     feature_names=list(X.columns),
                     class_names=['no credit', 'credit'],
                     filled=True, rounded=True,
                     special_characters=False)
graph = pydotplus.graphviz.graph_from_dot_file("tree3.out")
Image(graph.create_png())


Out[8]:

Сначала разбиваем по размеру доходов и наличию счета. Если доходы большие или счета нет, то, если других обязательств нет - выдаем, иначе нет. В противном случае выдаем краткосрочные кредиты (меньше 22.5 месяцев).

Теперь построим дерево без ограничения глубины.


In [9]:
# Fit a tree with no depth limit on the one-hot encoded data and render it.
# random_state pins the otherwise random choice between equally good splits,
# so the picture is the same on every run.
classifier = tree.DecisionTreeClassifier(random_state=0)
classifier.fit(X, y)
# class_names must be given in ascending order of the class labels; the
# target is encoded as 0 = no credit, 1 = credit, so 'no credit' comes first.
# With out_file set, export_graphviz writes the file and returns None, so
# its result is not stored.
tree.export_graphviz(classifier, out_file="tree.out",
                     feature_names=list(X.columns),
                     class_names=['no credit', 'credit'],
                     filled=True, rounded=True,
                     special_characters=False)
graph = pydotplus.graphviz.graph_from_dot_file("tree.out")
Image(graph.create_png())


Out[9]:

Теперь посмотрим на зависимость качества от глубины.


In [10]:
# Sweep the maximum tree depth and record, for every value, the
# cross-validated ROC AUC and the ROC AUC measured on the training data.
depths = np.arange(1, 30)
scores = []
train_score = []
# train_test_split returns (X_train, X_test, y_train, y_test) - the previous
# unpacking order put the test features into y_train and vice versa.
# NOTE(review): this hold-out split is not used by the loop below.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=0)
for depth in depths:
    # A fixed random_state makes the sweep (and the reported best depth)
    # reproducible between runs.
    model = tree.DecisionTreeClassifier(max_depth=depth, random_state=0)
    scores.append(
        model_selection.cross_val_score(model, X, y, scoring='roc_auc').mean())
    model.fit(X, y)
    # ROC AUC ranks objects by a continuous score, so feed it the predicted
    # probability of class 1 rather than hard 0/1 labels; this keeps the
    # train curve comparable with the 'roc_auc' cross-validation scorer.
    train_score.append(metrics.roc_auc_score(y, model.predict_proba(X)[:, 1]))

In [11]:
# Draw the cross-validation and train ROC AUC curves against tree depth,
# using the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('ROC_AUC score(depth)')
ax.plot(depths, scores, label='cross val score')
ax.plot(depths, train_score, label='train score')
ax.grid(True)
ax.set_xlabel('depth')
ax.set_ylabel('score')
ax.legend(loc='best')
plt.show()



In [12]:
print 'best depth =', np.argmax(scores) + 1


best depth = 3

Теперь посмотрим, что будет, если работать с german-numeric — датасетом, в котором категориальные признаки уже закодированы.


In [13]:
# Read the pre-encoded, all-integer version of the dataset: one
# whitespace-separated record per line.
with open('german_data-numeric.txt') as f:
    # Explicit comprehensions give real lists on both Python 2 and Python 3
    # (map() is lazy on Python 3). Blank lines are skipped so they cannot
    # produce empty, ragged rows.
    rows = [[int(value) for value in line.split()]
            for line in f if line.strip()]

# A list of equal-length integer rows converts straight into a frame; the
# intermediate per-row numpy arrays were redundant.
numeric_data = pd.DataFrame(rows)

In [14]:
# Display the parsed frame (the notebook shows a truncated view of it).
# NOTE(review): `.head()` would be enough for a sanity check - consider it.
numeric_data


Out[14]:
0 1 2 3 4 5 6 7 8 9 ... 15 16 17 18 19 20 21 22 23 24
0 1 6 4 12 5 5 3 4 1 67 ... 0 0 1 0 0 1 0 0 1 1
1 2 48 2 60 1 3 2 2 1 22 ... 0 0 1 0 0 1 0 0 1 2
2 4 12 4 21 1 4 3 3 1 49 ... 0 0 1 0 0 1 0 1 0 1
3 1 42 2 79 1 4 3 4 2 45 ... 0 0 0 0 0 0 0 0 1 1
4 1 24 3 49 1 3 3 4 4 53 ... 1 0 1 0 0 0 0 0 1 2
5 4 36 2 91 5 3 3 4 4 35 ... 0 0 1 0 0 0 0 1 0 1
6 4 24 2 28 3 5 3 4 2 53 ... 0 0 1 0 0 1 0 0 1 1
7 2 36 2 69 1 3 3 2 3 35 ... 0 1 1 0 1 0 0 0 0 1
8 4 12 2 31 4 4 1 4 1 61 ... 0 0 1 0 0 1 0 1 0 1
9 2 30 4 52 1 1 4 2 3 28 ... 1 0 1 0 0 1 0 0 0 2
10 2 12 2 13 1 2 2 1 3 25 ... 1 0 1 0 1 0 0 0 1 2
11 1 48 2 43 1 2 2 4 2 24 ... 0 0 1 0 1 0 0 0 1 2
12 2 12 2 16 1 3 2 1 3 22 ... 0 0 1 0 0 1 0 0 1 1
13 1 24 4 12 1 5 3 4 3 60 ... 1 0 1 0 0 1 0 1 0 2
14 1 15 2 14 1 3 2 4 3 28 ... 1 0 1 0 1 0 0 0 1 1
15 1 24 2 13 2 3 2 2 3 32 ... 0 0 1 0 0 1 0 1 0 2
16 4 24 4 24 5 5 3 4 2 53 ... 0 0 1 0 0 1 0 0 1 1
17 1 30 0 81 5 2 3 3 3 25 ... 0 0 1 0 0 1 0 0 1 1
18 2 24 2 126 1 5 2 2 4 44 ... 0 1 1 0 0 0 0 0 0 2
19 4 24 2 34 3 5 3 2 3 31 ... 0 0 1 0 0 1 0 0 1 1
20 4 9 4 21 1 3 3 4 3 48 ... 1 0 1 0 0 1 0 0 1 1
21 1 6 2 26 3 3 3 3 1 44 ... 0 0 1 0 1 0 0 0 1 1
22 1 10 4 22 1 2 3 3 1 48 ... 1 0 1 0 1 0 0 1 0 1
23 2 12 4 18 2 2 3 4 2 44 ... 0 1 1 0 0 1 0 0 1 1
24 4 10 4 21 5 3 4 1 3 26 ... 0 0 1 0 0 1 0 0 1 1
25 1 6 2 14 1 3 3 2 1 36 ... 0 0 1 0 0 1 0 1 0 1
26 4 6 0 4 1 5 4 4 3 39 ... 0 0 1 0 0 1 0 1 0 1
27 3 12 1 4 4 3 2 3 1 42 ... 0 0 1 0 1 0 0 0 1 1
28 2 7 2 24 1 3 3 2 1 34 ... 0 0 0 0 0 1 0 0 1 1
29 1 60 3 68 1 5 3 4 4 63 ... 0 0 1 0 0 1 0 0 1 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
970 2 15 2 15 2 3 3 2 1 22 ... 0 0 0 0 0 1 0 0 1 1
971 4 24 2 74 1 3 3 4 2 43 ... 1 0 1 0 0 1 0 1 0 1
972 1 24 1 12 1 1 2 4 4 29 ... 1 0 0 1 1 0 1 0 0 2
973 1 60 2 73 1 5 3 4 4 36 ... 0 0 0 1 1 0 0 0 1 2
974 4 30 4 28 1 3 2 2 3 33 ... 0 0 1 0 0 1 0 0 1 1
975 3 24 2 13 3 3 2 3 3 57 ... 0 0 1 0 0 1 0 1 0 1
976 2 6 2 8 1 3 2 3 1 64 ... 0 0 0 0 0 1 0 0 1 1
977 2 18 3 24 5 5 3 2 2 42 ... 0 0 1 0 0 1 0 0 1 1
978 4 24 3 25 1 5 3 4 3 47 ... 1 0 1 0 0 1 0 1 0 2
979 2 15 1 13 2 3 4 2 2 25 ... 1 0 1 0 1 0 0 0 1 2
980 2 30 4 84 1 4 3 2 2 49 ... 0 0 1 0 0 1 0 0 1 2
981 4 48 2 48 1 1 3 2 3 33 ... 0 0 1 0 1 0 0 0 0 2
982 3 21 2 29 2 3 2 1 3 28 ... 1 0 1 0 0 1 0 0 0 1
983 1 36 2 82 1 3 3 2 2 26 ... 0 1 1 0 0 1 0 0 1 2
984 4 24 4 20 1 4 3 2 2 30 ... 0 0 1 0 0 1 0 1 0 1
985 1 15 4 14 1 3 2 3 2 25 ... 0 0 1 0 1 0 0 0 1 1
986 3 42 0 63 1 2 1 1 2 33 ... 0 0 1 0 0 1 0 0 1 1
987 4 13 2 14 2 1 2 4 1 64 ... 0 0 1 0 0 1 0 0 1 1
988 1 24 2 66 1 1 3 2 4 29 ... 0 1 1 0 0 0 0 0 0 1
989 2 24 4 17 1 5 3 2 2 48 ... 0 0 1 0 0 1 0 1 0 1
990 4 12 4 36 5 2 3 1 2 37 ... 0 0 1 0 0 1 0 1 0 1
991 4 15 1 16 2 5 3 4 3 34 ... 0 0 1 0 0 1 0 1 0 1
992 1 18 2 19 5 4 4 4 3 23 ... 0 0 1 0 1 0 0 1 0 1
993 1 36 2 40 1 1 3 3 2 30 ... 0 0 1 0 0 1 0 0 0 1
994 4 12 2 24 5 5 3 3 3 50 ... 1 0 1 0 0 1 0 0 1 1
995 4 12 2 17 1 4 2 4 1 31 ... 0 0 1 0 0 1 0 1 0 1
996 1 30 2 39 1 3 1 4 2 40 ... 0 1 1 0 0 1 0 0 0 1
997 4 12 2 8 1 5 3 4 3 38 ... 0 0 1 0 0 1 0 0 1 1
998 1 45 2 18 1 3 3 4 4 23 ... 0 0 1 0 0 0 0 0 1 2
999 2 45 4 46 2 1 3 4 3 27 ... 0 1 1 0 0 1 0 0 1 1

1000 rows × 25 columns

Приведем target переменную от 1/2 к 1/0.


In [15]:
# Target: the last column, recoded so that label 1 stays 1 and every other
# label becomes 0. Features: all the remaining columns.
raw_target = numeric_data.iloc[:, -1]
y = (raw_target == 1).astype('int64')
X = numeric_data.iloc[:, :-1]

In [16]:
print 'Доли классов'
print '{} объектов 1-го класса'.format((y==1).sum() * 1./len(y))
print '{} объектов 0-го класса'.format((y==0).sum() * 1./len(y))


Доли классов
0.7 объектов 1-го класса
0.3 объектов 0-го класса

Построим дерево, получающееся при ограничении глубины не больше 2.


In [17]:
# Fit a shallow tree (depth <= 2) on the pre-encoded numeric data and render
# it. random_state pins the otherwise random choice between equally good
# splits, so the picture is the same on every run.
classifier = tree.DecisionTreeClassifier(max_depth=2, random_state=0)
classifier.fit(X, y)
# The numeric frame has integer column labels; export_graphviz expects
# feature names as strings, so convert them explicitly.
# class_names must be given in ascending order of the class labels; the
# target is encoded as 0 = no credit, 1 = credit, so 'no credit' comes first.
# With out_file set, export_graphviz writes the file and returns None, so
# its result is not stored.
tree.export_graphviz(classifier, out_file="tree3.out",
                     feature_names=[str(column) for column in X.columns],
                     class_names=['no credit', 'credit'],
                     filled=True, rounded=True,
                     special_characters=False)
graph = pydotplus.graphviz.graph_from_dot_file("tree3.out")
Image(graph.create_png())


Out[17]:

Сначала разбиваем по размеру доходов и наличию счета. Если доходы большие или счета нет, то, если других обязательств нет - выдаем, иначе нет. В противном случае выдаем краткосрочные кредиты (меньше 22.5 месяцев).

Теперь построим дерево без ограничения глубины.


In [18]:
# Fit a tree with no depth limit on the pre-encoded numeric data and render
# it. random_state pins the otherwise random choice between equally good
# splits, so the picture is the same on every run.
classifier = tree.DecisionTreeClassifier(random_state=0)
classifier.fit(X, y)
# The numeric frame has integer column labels; export_graphviz expects
# feature names as strings, so convert them explicitly.
# class_names must be given in ascending order of the class labels; the
# target is encoded as 0 = no credit, 1 = credit, so 'no credit' comes first.
# With out_file set, export_graphviz writes the file and returns None, so
# its result is not stored.
tree.export_graphviz(classifier, out_file="tree.out",
                     feature_names=[str(column) for column in X.columns],
                     class_names=['no credit', 'credit'],
                     filled=True, rounded=True,
                     special_characters=False)
graph = pydotplus.graphviz.graph_from_dot_file("tree.out")
Image(graph.create_png())


Out[18]:

Теперь посмотрим на зависимость качества от глубины.


In [19]:
# Sweep the maximum tree depth and record, for every value, the
# cross-validated ROC AUC and the ROC AUC measured on the training data.
depths = np.arange(1, 30)
scores = []
train_score = []
# train_test_split returns (X_train, X_test, y_train, y_test) - the previous
# unpacking order put the test features into y_train and vice versa.
# NOTE(review): this hold-out split is not used by the loop below.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=0)
for depth in depths:
    # A fixed random_state makes the sweep (and the reported best depth)
    # reproducible between runs.
    model = tree.DecisionTreeClassifier(max_depth=depth, random_state=0)
    scores.append(
        model_selection.cross_val_score(model, X, y, scoring='roc_auc').mean())
    model.fit(X, y)
    # ROC AUC ranks objects by a continuous score, so feed it the predicted
    # probability of class 1 rather than hard 0/1 labels; this keeps the
    # train curve comparable with the 'roc_auc' cross-validation scorer.
    train_score.append(metrics.roc_auc_score(y, model.predict_proba(X)[:, 1]))

In [20]:
# Draw the cross-validation and train ROC AUC curves against tree depth,
# using the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('ROC_AUC score(depth)')
ax.plot(depths, scores, label='cross val score')
ax.plot(depths, train_score, label='train score')
ax.grid(True)
ax.set_xlabel('depth')
ax.set_ylabel('score')
ax.legend(loc='best')
plt.show()



In [21]:
print 'best depth =', np.argmax(scores) + 1


best depth = 4